Experiment Summary

samples_1 <- paste0("Eickelberg_", 
                  c("B6B6_1",
                    "B6B6_2",
                    "B6B6_3",
                    "HLAB6_2", 
                    "HLAB6_3"))

samples_2 <- c(
  "1_B_65_11152018",
  "1_B6_B6_11132018",
  "2_HLA_5_11152018")

samples_3 <- c(
  "NaiveB6",
  "B6_B6_171501",
  "HLA_B6_163107",
  "HLA_B6_18467"
)

samples <- c(samples_1, samples_2, samples_3)

sample_paths <- file.path(data_dir,
                          samples,
                          "outs", 
                          "filtered_feature_bc_matrix")

names(sample_paths) <- c(str_remove(samples_1, "Eickelberg_"),
                         "B6B6_5",
                         "B6B6_4",
                         "HLAB6_5",
                         "Naive_B6",
                         "B6B6_6",
                         "HLAB6_6",
                         "HLAB6_7")
mat <- Read10X(sample_paths)

General QC for library prep

metrics_paths <- file.path(data_dir,
                          samples,
                          "outs", 
                          "metrics_summary.csv")

names(metrics_paths) <- str_remove(samples, "Eickelberg_")

mapping_dat <- map_dfr(metrics_paths, read_csv, .id = "sample")
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_double(),
##   `Number of Reads` = col_number(),
##   `Valid Barcodes` = col_character(),
##   `Sequencing Saturation` = col_character(),
##   `Q30 Bases in Barcode` = col_character(),
##   `Q30 Bases in RNA Read` = col_character(),
##   `Q30 Bases in UMI` = col_character(),
##   `Reads Mapped to Genome` = col_character(),
##   `Reads Mapped Confidently to Genome` = col_character(),
##   `Reads Mapped Confidently to Intergenic Regions` = col_character(),
##   `Reads Mapped Confidently to Intronic Regions` = col_character(),
##   `Reads Mapped Confidently to Exonic Regions` = col_character(),
##   `Reads Mapped Confidently to Transcriptome` = col_character(),
##   `Reads Mapped Antisense to Gene` = col_character(),
##   `Fraction Reads in Cells` = col_character(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_number(),
##   `Number of Reads` = col_number(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_number(),
##   `Number of Reads` = col_number(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_number(),
##   `Number of Reads` = col_number(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Estimated Number of Cells` = col_number(),
##   `Mean Reads per Cell` = col_number(),
##   `Median Genes per Cell` = col_number(),
##   `Number of Reads` = col_number(),
##   `Total Genes Detected` = col_number(),
##   `Median UMI Counts per Cell` = col_number()
## )
## See spec(...) for full column specifications.
clean_up_metadata <- function(metrics_summary) {
  metrics_summary <- mutate_all(metrics_summary, str_remove, "%$")
  metrics_summary <- mutate_at(metrics_summary, .vars= 2:ncol(metrics_summary), as.numeric)
  metrics_summary
}

mapping_dat <- clean_up_metadata(mapping_dat)

metrics <- colnames(mapping_dat)[2:ncol(mapping_dat)]

mapping_dat <- mapping_dat %>% 
  gather(metric, value, -sample) %>%
  mutate(sample = factor(sample, levels = unique(sample)))

p <- map(metrics, 
    function(x) {
    filter(mapping_dat, metric == x) %>% 
          ggplot(aes(sample, value)) +
            geom_point(aes(color = sample)) +
        scale_color_brewer(palette = "Paired") + 
        labs(y = x, 
             x = "") +
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
}) 
for(i in seq_along(p)){
  cat('\n### ', metrics[i], '\n')
  print(p[i])
  cat('\n')
}

Estimated Number of Cells

[[1]]

Mean Reads per Cell

[[1]]

Median Genes per Cell

[[1]]

Number of Reads

[[1]]

Valid Barcodes

[[1]]

Sequencing Saturation

[[1]]

Q30 Bases in Barcode

[[1]]

Q30 Bases in RNA Read

[[1]]

Q30 Bases in UMI

[[1]]

Reads Mapped to Genome

[[1]]

Reads Mapped Confidently to Genome

[[1]]

Reads Mapped Confidently to Intergenic Regions

[[1]]

Reads Mapped Confidently to Intronic Regions

[[1]]

Reads Mapped Confidently to Exonic Regions

[[1]]

Reads Mapped Confidently to Transcriptome

[[1]]

Reads Mapped Antisense to Gene

[[1]]

Fraction Reads in Cells

[[1]]

Total Genes Detected

[[1]]

Median UMI Counts per Cell

[[1]]

Q30 Bases in RNA Read 2

[[1]]

Preprocessing

Rerun preprocessing scaling, PCA, UMAP, etc.

so <- CreateSeuratObject(
  mat,
  min.cells = 3,
  min.features = 200,
  names.delim = "_",
  names.field = c(1,2)
)

so <- PercentageFeatureSet(so, 
                             pattern = "^mt-", 
                             col.name = "percent.mt")

so@meta.data <- so@meta.data %>% 
  tibble::rownames_to_column("cell") %>% 
  mutate(condition = str_split(orig.ident, "_", simplify = TRUE) %>% .[, 1] ) %>% 
  as.data.frame() %>% 
  tibble::column_to_rownames("cell")


dir.create(object_dir, showWarnings = FALSE)

saveRDS(so, file.path(object_dir, "unfiltered.rds"))
so <- readRDS(file.path(object_dir, "unfiltered.rds"))

Percent Mitochondrial UMIs

plot_violin(so@meta.data, 
            "orig.ident",
            "percent.mt",
            .fill = "condition") +
  labs(x = "", y = "Percent UMIs from Mitochondria")

# of genes detected

plot_violin(so@meta.data,
            "orig.ident", 
            "nFeature_RNA",
            .fill = "condition") +
  labs(x = "", y = "# of genes per cell")

# of UMIs detected

plot_violin(so@meta.data, 
            "orig.ident",
            "nCount_RNA", 
            .fill = "condition") +
  labs(x = "", y = "# of UMIs") 

Table of mitochondrial proportions per sample

so@meta.data %>% 
  group_by(orig.ident) %>% 
  summarize(median_percent_mito = median(percent.mt), 
            mean_percent_mito = mean(percent.mt)) %>% 
  arrange(desc(median_percent_mito))

Relationship between UMIs and % mitochondria

All samples

sample_names <- as.character(unique(so@meta.data$orig.ident))
per_sample <- map(sample_names, ~filter(so@meta.data, 
                                        orig.ident == .x))
p <- list()
for(i in seq_along(per_sample)){
  .col <- discrete_palette_default[i]
  p[[i]] <- ggplot(per_sample[[i]], aes(nCount_RNA, percent.mt)) +
        geom_point(aes(color = condition)) +
        scale_color_manual(values = .col)
}

plot_grid(plotlist= p, nrow = 4, ncol = 3)

for(i in seq_along(per_sample)){
  .col <- discrete_palette_default[i]
  cat('\n### ', sample_names[i], '\n')
  p <- ggplot(per_sample[[i]], aes(nCount_RNA, percent.mt)) +
        geom_point(aes(color = condition)) +
        scale_color_manual(values = .col)
  print(p)
  cat('\n')
}

B6B6_1

B6B6_2

B6B6_3

HLAB6_2

HLAB6_3

B6B6_5

B6B6_4

HLAB6_5

Naive_B6

B6B6_6

HLAB6_6

HLAB6_7

Filter cells and samples.

No filtering applied at this point.

so <- readRDS(file.path(object_dir, "unfiltered.rds"))

Annotate samples and assign colors to each sample.

annots <- so@meta.data %>% 
  select(orig.ident, condition) %>% 
  unique() %>%
  arrange(condition)

subgroup_order <- c(
  "B6B6",
  "HLAB6",
  "Naive"
)

annots <- mutate(annots, condition = factor(condition, levels = subgroup_order))

sample_order <- annots$orig.ident 

so@meta.data$orig.ident <- factor(so@meta.data$orig.ident, levels = sample_order)
so@meta.data$condition <- factor(so@meta.data$condition, levels = subgroup_order)

Normalize and embed into 2D with UMAP

so <- NormalizeData(so)

so <- FindVariableFeatures(
  so,
  selection.method = "vst",
  nfeatures = 2000,
  verbose = FALSE
)

so <- ScaleData(so, verbose = TRUE)

so <- RunPCA(so, 
             npcs = 100, 
             verbose = FALSE)

ElbowPlot(so, ndims = 100)

# make graphs and use graphs for UMAP
so <- FindNeighbors(so, 
                      reduction = "pca", 
                      dims = 1:30, 
                      k.param = 20L)


so <- RunUMAP(so, 
              graph = "RNA_snn", 
              min.dist = 0.3)

res <- c(0.05, 0.075, 0.1, 0.3, 0.5)
so <- FindClusters(so, 
                   resolution = res)

map(str_c("RNA_snn_res.", res), 
    ~plot_umap(so, .x)) %>% 
  plot_grid(plotlist = .,
            nrow = 3, 
            ncol = 2)
so$coarse_clusters <- so$RNA_snn_res.0.1

Idents(so) <- "coarse_clusters"

plot_umap(so, "coarse_clusters")
saveRDS(so, file.path(object_dir, "so.rds"))
so <- readRDS(file.path(object_dir, "so.rds"))

plot_umap(so, "condition", .col = palette_OkabeIto)

plot_umap(so, "coarse_clusters", label_text = TRUE, label_col = "black")

plot_umap(so, "orig.ident")

plot_features_split(so, "orig.ident", group = "condition")
## [[1]]

## 
## [[2]]

## 
## [[3]]

markers

Cluster markers

so <- readRDS(file.path(object_dir, "so.rds"))
Idents(so) <- "coarse_clusters"
mkrs <- FindAllMarkers(so, only.pos = TRUE)

write_tsv(mkrs, 
          file.path(mkrs_dir, "coarse_cluster_markers_all_data.tsv"))
so <- readRDS(file.path(object_dir, "so.rds"))
mkrs <- read_tsv(file.path(mkrs_dir, "coarse_cluster_markers_all_data.tsv"))
## Parsed with column specification:
## cols(
##   p_val = col_double(),
##   avg_logFC = col_double(),
##   pct.1 = col_double(),
##   pct.2 = col_double(),
##   p_val_adj = col_double(),
##   cluster = col_double(),
##   gene = col_character()
## )
topx <- mkrs %>% 
  group_by(cluster) %>% 
  top_n(10, avg_logFC) 
p <- DotPlot(so, 
        features = unique(topx$gene),
        group.by = "coarse_clusters") +
  coord_flip() +
  labs(x = "Cluster",
       y = "")

p

save_plot(file.path(fig_dir, 
                    "coarse_cluster_marker_dotplot.pdf"), 
          p, base_height = 24, base_asp = 1)

Try to assign cell types using cell atlas data

The Tabula Muris is a collection of single cell dataset from 20 organs.

We will use the gene expression profiles from these datasets to try to assign cell types to the identified clusters. The clustifyr package will correlate experession in our clusters to the tabula muris and report the most correlated cell type. It’s unlikely that these will be 100% correct, because I don’t believe that the tabula muris will have all of the cell types in a mouse, and I don’t believe hat they have cells from developing tissues. Nevertheless it will give us a good starting point to annotating the cell types.

library(clustifyr)
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.0.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##   genomic data. Bioinformatics 2016.
## ========================================
so <- readRDS(file.path(object_dir, "so.rds"))

tm_average <- readRDS("/Users/kriemo/Projects/sc_repos/vanotterloo/data/tabula_muris/TM_avg_expr.rds")

mdata <- get_metadata(so)

res <- clustify(so@assays$RNA@data, 
                tm_average, 
                query_genes = so@assays$RNA@var.features,
                metadata = mdata, 
                cluster_col = "coarse_clusters", 
                compute_method = "spearman")

hmap <- Heatmap(t(res), 
                viridis::viridis(256), 
                "Spearman\ncorrelation",
                row_title = "Cell types from Tabula Muris",
                column_title = "Clusters")
  
pdf(file.path(fig_dir, "cell_type_heatmap.pdf"), height = 12, width = 12)
draw(hmap)
dev.off()
## quartz_off_screen 
##                 2
hmap

Reuse existing annotations

so2 <- readRDS(file.path("objects_2", "so.rds"))

so2_mdata <- get_metadata(so2)
rm(so2)
gc()

so_mdata <- get_metadata(so) %>% 
  select(-(PC_1:UMAP_2))

so2_mdata_simple <- so2_mdata %>% 
  select(cell, cell_types)
combined_mdata <- left_join(so_mdata, so2_mdata_simple, by = "cell")

best_cell_type <- combined_mdata %>% 
  group_by(cell_types, coarse_clusters) %>% 
  summarize(n = n()) %>% 
  group_by(cell_types) %>% 
  summarize(coarse_clusters = nth(coarse_clusters, which(n == max(n))))


so@meta.data <- so_mdata %>% 
  left_join(best_cell_type, by = "coarse_clusters") %>% 
  column_to_rownames("cell")
pclusters <- plot_umap(so, "coarse_clusters")
pcell_types <- plot_umap(so, "cell_types")

plt <- plot_grid(pclusters, pcell_types)
save_plot(file.path(fig_dir,"umap_by_cell_type.pdf"),
          plt, nrow = 1, ncol = 2, base_asp = 1.75)

saveRDS(so, file.path(object_dir, "so.rds"))

Decontamination with SoupX

library(SoupX)

dataDirs <- file.path(data_dir, samples, "outs")
scl <- load10X(dataDirs)
scl <- inferNonExpressedGenes(scl)

map(str_c("Channel", 1:12),
  ~rownames(scl$channels[[.x]]$nonExpressedGenes)[seq(50)]
) %>% 
  unlist() %>% 
  unique() %>% 
  .[!str_detect(., "^Rp[ls]")] %>% 
  .[!str_detect(., "^mt")]

soup_genes <- list(
  HB = c("Hbb-bs", "Hba-a1", "Hba-a2", "Hbb-bt", "H2-Ab1", "H2-Aa", "H2-Ab1", "H2-Eb1"),
  Club = c("Scgb1a1"),
  Bcell = c("Cd74", "Ighg2c", "Igkc", "Ighm", "Ighg3"),
  AEC2 = c("Sftpc"),
  MAC = c("Apoe", "Lyz2", "Psap" ),
  OTHER = c("Ly6c1", "Ly6a" )
)


for(channel in str_c("Channel", 1:12)){
  message(channel)
  scl <- calculateContaminationFraction(scl, channel, soup_genes)
}

for(channel in str_c("Channel", 1:12)){
  message(channel)
  gg = plotChannelContamination(scl, channel)
  plot(gg)
}

for(channel in str_c("Channel", 1:12)){
  message(channel)
  scl = interpolateCellContamination(scl, channel, useGlobal = FALSE)
}

scl <- strainCells(scl)
scl <- adjustCounts(scl)

rename_cols <- names(sample_paths)

names(rename_cols) <- str_c("Channel", 1:12)

count_matrix <- scl$atoc

channels <- colnames(count_matrix) %>% 
  str_match("Channel[0-9]+") %>%
  .[, 1]

new_ids <- rename_cols[channels]

cell_ids <- str_remove(colnames(count_matrix),
                       "Channel[0-9]+___")

new_ids <- str_c(new_ids, "_", cell_ids) %>% 
  str_remove("-[0-9]+$")

colnames(count_matrix) <- new_ids

renormalize/cluster/embed via seurat

count_matrix <- count_matrix[rownames(so), colnames(so)]

cell_types <- so$cell_types
so <- CreateSeuratObject(counts = count_matrix , 
                         names.field = c(1, 2))

so$cell_types <- cell_types
so$condition <- str_split(so$orig.ident, "_", simplify = TRUE) %>% .[, 1] 
so <- NormalizeData(so)

so <- FindVariableFeatures(
  so,
  selection.method = "vst",
  nfeatures = 2000,
  verbose = FALSE
)

so <- ScaleData(so, verbose = TRUE)

so <- RunPCA(so, 
             npcs = 100, 
             verbose = FALSE)

ElbowPlot(so, ndims = 100)

# make graphs and use graphs for UMAP
so <- FindNeighbors(so, 
                      reduction = "pca", 
                      dims = 1:30, 
                      k.param = 20L)

so <- RunUMAP(so, 
              graph = "RNA_snn", 
              min.dist = 0.3)

res <- c(0.05, 0.075, 0.1, 0.3, 0.5)
so <- FindClusters(so, 
                   resolution = res)

map(str_c("RNA_snn_res.", res), 
    ~plot_umap(so, .x)) %>% 
  plot_grid(plotlist = .,
            nrow = 3, 
            ncol = 2)
so$coarse_clusters <- so$RNA_snn_res.0.1

Idents(so) <- "coarse_clusters"

plot_umap(so, "coarse_clusters")
plot_umap(so, "cell_types")
plot_umap(so, "condition")
saveRDS(so, file.path(object_dir, "so_souped.rds"))
to_plot <- c(
  "coarse_clusters",
  "cell_types",
  "condition",
  "orig.ident",
  "Mzb1",
  "Cd19"
) 

plt <- map(to_plot, 
    ~plot_umap(so, .x)) %>% 
  plot_grid(plotlist = ., nrow = 3, ncol = 2,
            rel_widths = c(1.2, 2, 1.2, 1.2, 1.2, 1.2))


save_plot(file.path(fig_dir,
                         paste0("umap_summary_soupx.pdf")),
                plt,
          nrow = 3, 
          ncol = 2,
          base_asp = 1.2)

markers

Cluster markers

so <- readRDS(file.path(object_dir, "so_souped.rds"))

mkrs <- FindAllMarkers(so, only.pos = TRUE)

write_tsv(mkrs, 
          file.path(mkrs_dir, "coarse_cluster_markers_all_data_soupx.tsv"))

Overlay VDJ data

vdj_subdirs <- c(
  "NaiveB6_Bcell",
  "B6_B6_171501_Bcell",
  "HLA_B6_163107_Bcell",
  "HLA_B6_168467_Bcell")

vdj_dirs <- file.path("/Users/kriemo/Projects/sc_repos/eickelberg/data/vdj",
                      vdj_subdirs,
                      "outs")

prefixes <- str_c(names(sample_paths)[9:12], "_")

out <- map2_dfr(vdj_dirs, prefixes, get_clonotypes)

so_tmp <- add_clonotypes(so, 
                      vdj_dirs, 
                      prefixes = prefixes)

so_tmp$frequency <- ifelse(is.na(so_tmp$frequency), 
                           0L,
                           so_tmp$frequency)

so_tmp$proportion <- ifelse(is.na(so_tmp$proportion), 
                           0L,
                           so_tmp$proportion)

so_tmp$clonotype_present <- ifelse(is.na(so_tmp$cdr3s_aa), 
                           "None",
                           "Present")

c_mat <- so_tmp@meta.data %>% 
  tibble::rownames_to_column("cell") %>%
  mutate(clonotype_id = str_c(str_sub(orig.ident, -1), 
                              "-", 
                              raw_clonotype_id)) %>% 
  select(cell, clonotype_id) %>% 
  mutate(present = 1L,
         clonotype_id = ifelse(is.na(clonotype_id), 
                               "no_clonotype", 
                               clonotype_id),
         clonotype_id =  ifelse(clonotype_id == "None", 
                                "no_clonotype", 
                                clonotype_id)) %>% # need to figure out where None came from...
  spread(cell, present, fill = 0L) %>% 
  tibble::column_to_rownames("clonotype_id") %>% 
  as.data.frame() %>% 
  as.matrix()

c_mat <- c_mat[, colnames(so_tmp@assays$RNA@data)]

c_mat <- as(c_mat, "dgCMatrix")

so_tmp@assays[["tcr"]] <- CreateAssayObject(counts = c_mat)
saveRDS(so, file.path(object_dir, "so_souped_w_vdj.rds"))